---
title: |
  | Analysis: Did the GDPR policy have an impact on
  | attitudes towards data sharing/privacy?
author: 'Paul C. Bauer, Frederic Gerdon, Florian Keusch, Frauke Kreuter, David Vannette'
date:  |
  | `r gsub("^0", "", format(Sys.time(), "%d %B, %Y"))`
output:
  html_document:
    toc: true
    toc_depth: 2
---

<style>
/* Whole document: */
body{
  font-family: Georgia;
  font-size: 11pt;
  text-align: justify;
}
pre {
  white-space: pre !important;
  overflow-y: scroll !important;
}
</style>

```{r setup, include=FALSE}
knitr::opts_chunk$set(cache = FALSE)

# Packages
library(haven)
library(plotly)
library(dplyr)
library(stargazer)
library(readr)
library(stringr)
library(xtable)
library(kableExtra)
library(knitr)
library(tidyr)
library(broom)
library(gridExtra)
library(gtrendsR)
library(reshape2)
library(lubridate)
library(plm)
```




<br><br>


\clearpage

# Description of raw data
The raw data files that contain the survey panel data are the following:

* `data_project_700352_2018_04_23 - Final.csv`
* `data_project_727416_2018_08_02 - Final.csv`
* `data_project_738555_2018_11_08 - Final.csv`

The raw data files containing the open-ended responses coded in Excel are the following:

* `Trust1_open-ends_code_new.csv`
* `Trust2_open-ends_code_new.csv`
* `Trust3_open-ends_code_new20.csv`

Reproduction files start with the data files after data management/recoding/anonymization steps.



<br><br><br>


# Data management: Raw data


## Data management: Wave 1

```{r importing-data-wave1, message=FALSE, warning=FALSE}
# Read the raw wave 1 export (semicolon-delimited, Latin-1 encoded)
data <- read_delim(
  "./input_raw/data_project_700352_2018_04_23 - Final.csv",
  delim = ";",
  locale = locale(encoding = "latin1")
)

# Disposition codes of all rows in the raw export (printed in the report)
table(data$dispcode, useNA = "always")

# Sample statistics before cleaning. Matching rows are counted directly so
# that a disposition code that does not occur in this wave yields 0 rather
# than NA (indexing a table() by an absent name returns NA, which would also
# turn the sums across waves into NA).
n.sample.original1 <- nrow(data)
n.rejectedquota1 <- sum(data$dispcode == "Rejected (quota full) (36)", na.rm = TRUE)
n.screenedout1 <- sum(data$dispcode == "Screened out (37)", na.rm = TRUE)
n.suspended1 <- sum(data$dispcode == "Suspended (22)", na.rm = TRUE)

# Filter complete answers
data <- data %>%
  filter(dispcode %in% c("Completed (31)", "Completed after break (32)"))

# Identify/delete duplicates after screening process: every row whose
# respondent ID occurs more than once is dropped (not only the repeats)
duplicateIDs <- data$p_0002[duplicated(data$p_0002)]
n.duplicated1 <- sum(data$p_0002 %in% duplicateIDs) # N duplicate rows/observations
data <- data %>% filter(!p_0002 %in% duplicateIDs)

# N rows cleaned
n.sample.cleaned1 <- nrow(data)
```    
    

<br><br>


```{r renaming-vars-wave1, message=FALSE, warning=FALSE}

# Give the wave 1 survey variables descriptive names, then convert the result
# to a plain data.frame.
data <- data %>%
  rename(
    # Respondent ID and demographics
    pid = p_0002, sex = v_99, age = v_101, state = v_100,
    school.education = v_62, highest.education = v_60,

    # Device usage
    device.smartphone = v_11, device.handy = v_12, device.desktop = v_13,
    device.tablet = v_14, device.ereader = v_15,

    # Online accounts
    account.google = v_549, account.facebook = v_550, account.twitter = v_551,
    account.linkedin = v_552, account.xing = v_553,

    random_id = c_0002,

    # Group 1: standard trust scale, each item with a ".why" follow-up
    trust.group1.google.norm = v_618, trust.group1.google.norm.why = v_574,
    trust.group1.facebook.norm = v_619, trust.group1.facebook.norm.why = v_576,
    trust.group1.fedoffstats.norm = v_620, trust.group1.fedoffstats.norm.why = v_578,
    trust.group1.researchers.norm = v_621, trust.group1.researchers.norm.why = v_580,

    # Group 2: standard trust scale without follow-up
    trust.group2.google.norm = v_622, trust.group2.facebook.norm = v_624,
    trust.group2.fedoffstats.norm = v_626, trust.group2.researchers.norm = v_628,

    # Group 3: probability trust scale, each item with a ".why" follow-up
    trust.group3.google.prob = v_630, trust.group3.google.prob.why = v_597,
    trust.group3.facebook.prob = v_631, trust.group3.facebook.prob.why = v_598,
    trust.group3.fedoffstats.prob = v_632, trust.group3.fedoffstats.prob.why = v_599,
    trust.group3.researchers.prob = v_633, trust.group3.researchers.prob.why = v_600,

    # Group 4: probability trust scale without follow-up
    trust.group4.google.prob = v_634, trust.group4.facebook.prob = v_636,
    trust.group4.fedoffstats.prob = v_638, trust.group4.researchers.prob = v_640,

    # Generalized trust and GDPR items
    trust.generalized = v_642,
    gdpr.know = v_604,
    gdpr.aims = v_605,

    # Privacy items
    privacy.hurt.google = v_81, privacy.hurt.facebook = v_82,
    privacy.hurt.authorities = v_83, privacy.hurt.researchers = v_84,
    privacy.worry = v_45
  ) %>%
  data.frame()
```
  
  
  
```{r recoding-vars-wave1, message=FALSE, warning=FALSE}

# Age: the raw value is subtracted from 2018 (presumably a year of birth --
# confirm against the codebook); results above 200 are set to missing
data$age <- 2018 - data$age
data$age <- replace(data$age, data$age > 200, NA)

# School education, highest education, gender: "-77" and "0" become NA
for (i in c("school.education", "highest.education", "sex")) {
  data[[i]][data[[i]] %in% c("-77", "0")] <- NA
}

# State: the dropdown placeholder and "-77" become NA, the full state names
# are replaced by two-letter abbreviations
data$state[data$state %in% c("---hier klicken---", "-77")] <- NA
state.codes <- c(
  "Baden-Württemberg" = "BW", "Bayern" = "BY", "Berlin" = "BE",
  "Brandenburg" = "BB", "Bremen" = "HB", "Hamburg" = "HH", "Hessen" = "HE",
  "Mecklenburg-Vorpommern" = "MV", "Niedersachsen" = "NI",
  "Nordrhein-Westfalen" = "NW", "Rheinland-Pfalz" = "RP", "Saarland" = "SL",
  "Sachsen" = "SN", "Sachsen-Anhalt" = "ST", "Schleswig-Holstein" = "SH",
  "Thüringen" = "TH"
)
for (state.name in names(state.codes)) {
  data$state[data$state == state.name] <- state.codes[[state.name]]
}

# Codes set to NA in the device and account items
missing.codes <- c("-66", "-77", "-99", "0")

# Device Usage
vars <- c("device.smartphone", "device.handy", "device.desktop", "device.tablet", "device.ereader")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
}

# Accounts: same codes set to NA, German answer labels translated to English
account.labels <- c(
  "Nein, hatte nie einen Account" = "No, never",
  "Nein, derzeit nicht, aber früher" = "No, but before",
  "Ja, habe derzeit einen Account" = "Yes"
)
vars <- c("account.google", "account.facebook", "account.twitter", "account.linkedin", "account.xing")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
  for (label in names(account.labels)) {
    data[[i]][data[[i]] == label] <- account.labels[[label]]
  }
}





# STANDARD TRUST SCALES (groups 1 and 2)
# The raw codes -66/-77/-99/0 are set to NA *before* the labelled end points
# are reduced to their numbers, so a raw "0" is never read as the scale
# value 0.
vars <- c(
  "trust.group1.google.norm", "trust.group1.facebook.norm",
  "trust.group1.fedoffstats.norm", "trust.group1.researchers.norm",
  "trust.group2.google.norm", "trust.group2.facebook.norm",
  "trust.group2.fedoffstats.norm", "trust.group2.researchers.norm"
)
data$trust.group1.google.norm.old <- data$trust.group1.google.norm # raw copy for checks
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "0 überhaupt nicht"] <- "0"
  data[[i]][data[[i]] == "10 voll und ganz"] <- "10"
  data[[i]] <- as.numeric(data[[i]])
}

# Check
# table(data$trust.group1.google.norm)
# table(data$trust.group1.google.norm.old)


# STANDARD TRUST SCALES PROBING
# The ".why" items keep their content; -66/-77/-99 are set to NA and a
# logical "<item>.missing" column flags the NA entries.
vars <- c(
  "trust.group1.google.norm.why", "trust.group1.facebook.norm.why",
  "trust.group1.fedoffstats.norm.why", "trust.group1.researchers.norm.why",
  "trust.group3.google.prob.why", "trust.group3.facebook.prob.why",
  "trust.group3.fedoffstats.prob.why", "trust.group3.researchers.prob.why"
)
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99")] <- NA

  # data[,i] <- as.character(data[,i])
  # data[,i] <- trimws(data[,i], which = c("right"))

  data[[paste0(i, ".missing")]] <- is.na(data[[i]])
}


# PROBABILITY TRUST SCALES (groups 3 and 4)
# Same order as above: codes to NA first, then the labelled end points are
# shortened, the percent sign is stripped and the column is made numeric.
vars <- c(
  "trust.group3.google.prob", "trust.group3.facebook.prob",
  "trust.group3.fedoffstats.prob", "trust.group3.researchers.prob",
  "trust.group4.google.prob", "trust.group4.facebook.prob",
  "trust.group4.fedoffstats.prob", "trust.group4.researchers.prob"
)
data$trust.group3.google.prob.old <- data$trust.group3.google.prob # raw copy for checks
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "0% - sicher nicht"] <- "0%"
  data[[i]][data[[i]] == "100% - sicher"] <- "100%"
  data[[i]] <- as.numeric(str_replace(data[[i]], "%", ""))
}
# table(data$trust.group3.google.prob.old)
# table(data$trust.group3.google.prob)


# Combined standard-scale variables (e.g. trust.google.norm): the group 1
# value where present, otherwise the group 2 value
data$trust.google.norm <- coalesce(data$trust.group1.google.norm, data$trust.group2.google.norm)
data$trust.facebook.norm <- coalesce(data$trust.group1.facebook.norm, data$trust.group2.facebook.norm)
data$trust.researchers.norm <- coalesce(data$trust.group1.researchers.norm, data$trust.group2.researchers.norm)
data$trust.fedoffstats.norm <- coalesce(data$trust.group1.fedoffstats.norm, data$trust.group2.fedoffstats.norm)
# Checks
# sum(!is.na(data$trust.google.norm))
# sum(!is.na(data$trust.group1.google.norm))
# sum(!is.na(data$trust.group2.google.norm))

# Combined probability-scale variables (e.g. trust.google.prob): the group 3
# value where present, otherwise the group 4 value
data$trust.google.prob <- coalesce(data$trust.group3.google.prob, data$trust.group4.google.prob)
data$trust.facebook.prob <- coalesce(data$trust.group3.facebook.prob, data$trust.group4.facebook.prob)
data$trust.researchers.prob <- coalesce(data$trust.group3.researchers.prob, data$trust.group4.researchers.prob)
data$trust.fedoffstats.prob <- coalesce(data$trust.group3.fedoffstats.prob, data$trust.group4.fedoffstats.prob)
# Checks
# sum(!is.na(data$trust.google.prob))
# sum(!is.na(data$trust.group3.google.prob))
# sum(!is.na(data$trust.group4.google.prob))

# Dummy: respondent has a probability-scale answer for Google in wave 1
# (group 3 or 4)
data$scale.type.wave1.prob <- !is.na(data$trust.google.prob)

# Dummy: respondent has a standard-scale answer for Google in wave 1
# (group 1 or 2)
data$scale.type.wave1.norm <- !is.na(data$trust.google.norm)


# Create trust variables that join normal and prob. scales
#
# For every respondent the first non-missing value is taken in the order
# group 1 (norm), group 2 (norm), group 3 (prob / 10), group 4 (prob / 10).
# Probability answers are divided by 10 so that they share the range of the
# standard scale. coalesce() does this for whole columns at once; it replaces
# row-by-row loops over 1:nrow(data), which were slow and would misbehave on
# an empty data set.
data$trust.google <- coalesce(
  data$trust.group1.google.norm,
  data$trust.group2.google.norm,
  data$trust.group3.google.prob / 10,
  data$trust.group4.google.prob / 10
)

# ... for Facebook
data$trust.facebook <- coalesce(
  data$trust.group1.facebook.norm,
  data$trust.group2.facebook.norm,
  data$trust.group3.facebook.prob / 10,
  data$trust.group4.facebook.prob / 10
)

# ... for researchers
data$trust.researchers <- coalesce(
  data$trust.group1.researchers.norm,
  data$trust.group2.researchers.norm,
  data$trust.group3.researchers.prob / 10,
  data$trust.group4.researchers.prob / 10
)

# ... for Federal Statistical Office
data$trust.fedoffstats <- coalesce(
  data$trust.group1.fedoffstats.norm,
  data$trust.group2.fedoffstats.norm,
  data$trust.group3.fedoffstats.prob / 10,
  data$trust.group4.fedoffstats.prob / 10
)

# Generalized trust: raw codes (including a bare "0") are set to NA before
# the labelled end points are reduced to 0 and 10
data$trust.generalized.old <- data$trust.generalized # raw copy for checks
data$trust.generalized[data$trust.generalized %in% c("0", "-66", "-77", "-99")] <- NA
data$trust.generalized[data$trust.generalized == "0 - man kann nicht vorsichtig genug sein"] <- "0"
data$trust.generalized[data$trust.generalized == "10 - man kann den meisten Menschen vertrauen"] <- "10"
data$trust.generalized <- as.numeric(data$trust.generalized)

# Check
# table(data$trust.generalized)
# table(data$trust.generalized.old)

# Knowing GDPR
data$gdpr.know[data$gdpr.know %in% c("-66", "-77", "-99", "0")] <- NA

# Aims of GDPR ("0" is not recoded here)
data$gdpr.aims[data$gdpr.aims %in% c("-66", "-77", "-99")] <- NA

# Privacy: -66/-77/-99 are set to NA for all items; "0" additionally for
# privacy.worry only
vars <- c("privacy.hurt.google", "privacy.hurt.facebook", "privacy.hurt.authorities", "privacy.hurt.researchers", "privacy.worry")
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99")] <- NA
}
data$privacy.worry[data$privacy.worry == "0"] <- NA



# Keep the cleaned wave under its own name and remove the generic object so
# the next wave cannot pick it up by accident
data.wave1 <- data
rm(data)
```


## Data management: Wave 2

```{r importing-data-wave2, message=FALSE, warning=FALSE}
# Read the raw wave 2 export (comma-delimited, Latin-1 encoded)
data <- read_delim(
  "./input_raw/data_project_727416_2018_08_02 - Final.csv",
  delim = ",",
  locale = locale(encoding = "latin1")
)

# Disposition codes of all rows in the raw export (printed in the report)
table(data$dispcode, useNA = "always")

# Sample statistics before cleaning. Matching rows are counted directly so
# that a disposition code that does not occur in this wave yields 0 rather
# than NA (indexing a table() by an absent name returns NA, which would also
# turn the sums across waves into NA).
n.sample.original2 <- nrow(data)
n.rejectedquota2 <- sum(data$dispcode == "Rejected (quota full) (36)", na.rm = TRUE)
n.screenedout2 <- sum(data$dispcode == "Screened out (37)", na.rm = TRUE)
n.suspended2 <- sum(data$dispcode == "Suspended (22)", na.rm = TRUE)

# Filter complete answers
data <- data %>%
  filter(dispcode %in% c("Completed (31)", "Completed after break (32)"))

# Identify/delete duplicates after screening process: every row whose
# respondent ID occurs more than once is dropped (not only the repeats)
duplicateIDs <- data$p_0002[duplicated(data$p_0002)]
n.duplicated2 <- sum(data$p_0002 %in% duplicateIDs) # N duplicate rows/observations
data <- data %>% filter(!p_0002 %in% duplicateIDs)

# N rows cleaned
n.sample.cleaned2 <- nrow(data)
```    
    

<br><br>


```{r renaming-vars-wave2, message=FALSE, warning=FALSE}

# Give the wave 2 survey variables descriptive names, then convert the result
# to a plain data.frame.
data <- data %>%
  rename(
    # Respondent ID and demographics
    pid = p_0002, sex = v_99, age = v_101, state = v_100,
    school.education = v_62, highest.education = v_60,

    # Device usage
    device.smartphone = v_11, device.handy = v_12, device.desktop = v_13,
    device.tablet = v_14, device.ereader = v_15,

    # Online accounts
    account.google = v_549, account.facebook = v_550, account.twitter = v_551,
    account.linkedin = v_552, account.xing = v_553,

    random_id = c_0002,

    # Trust items
    trust.google = v_618, trust.facebook = v_619,
    trust.fedoffstats = v_620, trust.researchers = v_621,
    trust.generalized = v_642,

    # GDPR items
    gdpr.know = v_604,
    gdpr.aims = v_605,

    # Privacy items
    privacy.worry = v_45,
    privacy.hurt.google = v_666, privacy.hurt.facebook = v_667,
    privacy.hurt.fedoffstats = v_668, privacy.hurt.researchers = v_669,

    # Facebook data items
    facebook_data_name = v_671, facebook_data_email = v_672,
    facebook_data_address = v_673, facebook_data_birthdate = v_674,
    facebook_data_phone = v_675, facebook_data_income = v_677,
    facebook_data_mstatus = v_678, facebook_data_nchildren = v_679,
    facebook_data_location = v_681, facebook_data_browserhistory = v_682,
    facebook_data_otheraccounts = v_683, facebook_data_thirdpartydata = v_684,

    # Google data items
    google_data_name = v_705, google_data_email = v_706,
    google_data_address = v_707, google_data_birthdate = v_708,
    google_data_phone = v_709, google_data_income = v_711,
    google_data_mstatus = v_712, google_data_nchildren = v_713,
    google_data_location = v_715, google_data_browserhistory = v_716,
    google_data_otheraccounts = v_717, google_data_thirdpartydata = v_718
  ) %>%
  data.frame()
```
  
  
  
```{r recoding-vars-wave2, message=FALSE, warning=FALSE}

# Age: the raw value is subtracted from 2018 (presumably a year of birth --
# confirm against the codebook); results above 200 are set to missing
data$age <- 2018 - data$age
data$age <- replace(data$age, data$age > 200, NA)

# School education, highest education, gender: "-77" and "0" become NA
for (i in c("school.education", "highest.education", "sex")) {
  data[[i]][data[[i]] %in% c("-77", "0")] <- NA
}

# State: the dropdown placeholder and "-77" become NA, the full state names
# are replaced by two-letter abbreviations
data$state[data$state %in% c("---hier klicken---", "-77")] <- NA
state.codes <- c(
  "Baden-Württemberg" = "BW", "Bayern" = "BY", "Berlin" = "BE",
  "Brandenburg" = "BB", "Bremen" = "HB", "Hamburg" = "HH", "Hessen" = "HE",
  "Mecklenburg-Vorpommern" = "MV", "Niedersachsen" = "NI",
  "Nordrhein-Westfalen" = "NW", "Rheinland-Pfalz" = "RP", "Saarland" = "SL",
  "Sachsen" = "SN", "Sachsen-Anhalt" = "ST", "Schleswig-Holstein" = "SH",
  "Thüringen" = "TH"
)
for (state.name in names(state.codes)) {
  data$state[data$state == state.name] <- state.codes[[state.name]]
}

# Codes set to NA in the device and account items
missing.codes <- c("-66", "-77", "-99", "0")

# Device Usage
vars <- c("device.smartphone", "device.handy", "device.desktop", "device.tablet", "device.ereader")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
}

# Accounts: same codes set to NA, German answer labels translated to English
account.labels <- c(
  "Nein, hatte nie einen Account" = "No, never",
  "Nein, derzeit nicht, aber früher" = "No, but before",
  "Ja, habe derzeit einen Account" = "Yes"
)
vars <- c("account.google", "account.facebook", "account.twitter", "account.linkedin", "account.xing")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
  for (label in names(account.labels)) {
    data[[i]][data[[i]] == label] <- account.labels[[label]]
  }
}

# STANDARD TRUST SCALES
# The raw codes -66/-77/-99/0 are set to NA *before* the labelled end points
# are reduced to their numbers, so a raw "0" is never read as the scale
# value 0.
vars <- c("trust.google", "trust.facebook", "trust.fedoffstats", "trust.researchers")
data$trust.google.old <- data$trust.google # raw copy for checks
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "0 überhaupt nicht"] <- "0"
  data[[i]][data[[i]] == "10 voll und ganz"] <- "10"
  data[[i]] <- as.numeric(data[[i]])
}

# Check
# table(data$trust.google)
# table(data$trust.google.old)











# Generalized trust: raw codes (including a bare "0") are set to NA before
# the labelled end points are reduced to 0 and 10
data$trust.generalized.old <- data$trust.generalized # raw copy for checks
data$trust.generalized[data$trust.generalized %in% c("0", "-66", "-77", "-99")] <- NA
data$trust.generalized[data$trust.generalized == "0 - man kann nicht vorsichtig genug sein"] <- "0"
data$trust.generalized[data$trust.generalized == "10 - man kann den meisten Menschen vertrauen"] <- "10"
data$trust.generalized <- as.numeric(data$trust.generalized)

# Check
# table(data$trust.generalized)
# table(data$trust.generalized.old)



# Knowing GDPR
data$gdpr.know[data$gdpr.know %in% c("-66", "-77", "-99", "0")] <- NA

# Aims of GDPR ("0" is not recoded here)
data$gdpr.aims[data$gdpr.aims %in% c("-66", "-77", "-99")] <- NA


# Privacy: -66/-77/-99 are set to NA for all items; "0" additionally for
# privacy.worry only
vars <- c("privacy.hurt.google", "privacy.hurt.facebook", "privacy.hurt.fedoffstats", "privacy.hurt.researchers", "privacy.worry")
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99")] <- NA
}
data$privacy.worry[data$privacy.worry == "0"] <- NA

#



# Facebook/Google data items: the same twelve item suffixes exist for both
# companies, Facebook first. Codes -66/-77/-99/0 are set to NA first, then
# the "Ja, ..." answer becomes 1 and the "Nein, ..." answer becomes 0.
data.items <- c(
  "name", "email", "address", "birthdate", "phone", "income", "mstatus",
  "nchildren", "location", "browserhistory", "otheraccounts", "thirdpartydata"
)
vars <- c(paste0("facebook_data_", data.items), paste0("google_data_", data.items))
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "Ja, sollten gespeichert werden dürfen"] <- "1"
  data[[i]][data[[i]] == "Nein, sollten nicht gespeichert werden dürfen"] <- "0"
  data[[i]] <- as.numeric(data[[i]])
  # print(table(data[,i]))
}


# Keep the cleaned wave under its own name and remove the generic object so
# the next wave cannot pick it up by accident
data.wave2 <- data
rm(data)
```

## Data management: Wave 3

```{r importing-data-wave3, message=FALSE, warning=FALSE}
# Read the raw wave 3 export (comma-delimited, Latin-1 encoded)
data <- read_delim(
  "./input_raw/data_project_738555_2018_11_08 - Final.csv",
  delim = ",",
  locale = locale(encoding = "latin1")
)

# Non starters don't even count into the started questionnaire category
data <- data %>% filter(dispcode != "Not yet started (20)")

# Disposition codes of the remaining rows (printed in the report)
table(data$dispcode, useNA = "always")

# Sample statistics before cleaning. Matching rows are counted directly so
# that a disposition code that does not occur in this wave yields 0 rather
# than NA (indexing a table() by an absent name returns NA, which would also
# turn the sums across waves into NA).
n.sample.original3 <- nrow(data)
n.rejectedquota3 <- sum(data$dispcode == "Rejected (quota full) (36)", na.rm = TRUE)
n.screenedout3 <- sum(data$dispcode == "Screened out (37)", na.rm = TRUE)
n.suspended3 <- sum(data$dispcode == "Suspended (22)", na.rm = TRUE)

# Filter complete answers
data <- data %>%
  filter(dispcode %in% c("Completed (31)", "Completed after break (32)"))

# Identify/delete duplicates after screening process: every row whose
# respondent ID occurs more than once is dropped (not only the repeats)
# sum(duplicated(data$p_0002)) -> give 8 duplicates (could be same id several times)
duplicateIDs <- data$p_0002[duplicated(data$p_0002)]
n.duplicated3 <- sum(data$p_0002 %in% duplicateIDs) # N duplicate rows/observations
data <- data %>% filter(!p_0002 %in% duplicateIDs)

# N rows cleaned
n.sample.cleaned3 <- nrow(data)
```    
    

<br><br>


```{r renaming-vars-wave3, message=FALSE, warning=FALSE}

# Give the wave 3 survey variables descriptive names, then convert the result
# to a plain data.frame.
data <- data %>%
  rename(
    # Respondent ID and demographics
    pid = p_0002, sex = v_99, age = v_101, state = v_100,
    school.education = v_62, highest.education = v_60,

    # Device usage
    device.smartphone = v_11, device.handy = v_12, device.desktop = v_13,
    device.tablet = v_14, device.ereader = v_15,

    # Online accounts
    account.google = v_549, account.facebook = v_550, account.twitter = v_551,
    account.linkedin = v_552, account.xing = v_553,

    random_id = c_0002,

    # Trust measures before survey experiment
    trust.google = v_618, trust.facebook = v_619,
    trust.fedoffstats = v_620, trust.researchers = v_621,
    trust.generalized = v_642,

    gdpr.know = v_604,
    gdpr.aims = v_605,
    privacy.worry = v_45,

    # Here after v_45 the experiment starts
    # Randomization variable (is defined earlier)
    # NOTE(review): c_0002 is mapped to two new names in this call (random_id
    # above and randomization_gdprinfo here). The recoding below reads
    # randomization_gdprinfo -- confirm that the installed dplyr version
    # resolves the double mapping as intended.
    randomization_gdprinfo = c_0002,

    text.comprehensible = v_725,
    smartphone.howmany = v_726,

    # Trust measures after the experiment
    trust.google.postexp = v_719, trust.facebook.postexp = v_720,
    trust.generalized.postexp = v_724,

    # Feelings about data sharing for research, after the experiment
    feeling.google.share.research.postexp = v_727,
    feeling.facebook.share.research.postexp = v_739,
    feeling.fedoffstats.share.research.postexp = v_740,

    # Feelings about data sharing for commercial purposes, after the experiment
    feeling.google.share.commercial.postexp = v_741,
    feeling.facebook.share.commercial.postexp = v_742,
    feeling.fedoffstats.share.commercial.postexp = v_743,

    rts.gdpr = rts4942791, rts.passengers = rts4942793,
    rts.previouspage = rts4942789
  ) %>%
  data.frame()
```
  
  
  
```{r recoding-vars-wave3, message=FALSE, warning=FALSE}

# Age: the raw value is subtracted from 2018 (presumably a year of birth --
# confirm against the codebook); results above 200 are set to missing
data$age <- 2018 - data$age
data$age <- replace(data$age, data$age > 200, NA)

# School education, highest education, gender: "-77" and "0" become NA
for (i in c("school.education", "highest.education", "sex")) {
  data[[i]][data[[i]] %in% c("-77", "0")] <- NA
}

# State: the dropdown placeholder and "-77" become NA, the full state names
# are replaced by two-letter abbreviations
data$state[data$state %in% c("---hier klicken---", "-77")] <- NA
state.codes <- c(
  "Baden-Württemberg" = "BW", "Bayern" = "BY", "Berlin" = "BE",
  "Brandenburg" = "BB", "Bremen" = "HB", "Hamburg" = "HH", "Hessen" = "HE",
  "Mecklenburg-Vorpommern" = "MV", "Niedersachsen" = "NI",
  "Nordrhein-Westfalen" = "NW", "Rheinland-Pfalz" = "RP", "Saarland" = "SL",
  "Sachsen" = "SN", "Sachsen-Anhalt" = "ST", "Schleswig-Holstein" = "SH",
  "Thüringen" = "TH"
)
for (state.name in names(state.codes)) {
  data$state[data$state == state.name] <- state.codes[[state.name]]
}

# Codes set to NA in the device and account items
missing.codes <- c("-66", "-77", "-99", "0")

# Device Usage
vars <- c("device.smartphone", "device.handy", "device.desktop", "device.tablet", "device.ereader")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
}

# Accounts: same codes set to NA, German answer labels translated to English
account.labels <- c(
  "Nein, hatte nie einen Account" = "No, never",
  "Nein, derzeit nicht, aber früher" = "No, but before",
  "Ja, habe derzeit einen Account" = "Yes"
)
vars <- c("account.google", "account.facebook", "account.twitter", "account.linkedin", "account.xing")
for (i in vars) {
  data[[i]][data[[i]] %in% missing.codes] <- NA
  for (label in names(account.labels)) {
    data[[i]][data[[i]] == label] <- account.labels[[label]]
  }
}





# STANDARD TRUST SCALES (measured before the experiment)
# The raw codes -66/-77/-99/0 are set to NA *before* the labelled end points
# are reduced to their numbers, so a raw "0" is never read as the scale
# value 0.
vars <- c("trust.google", "trust.facebook", "trust.fedoffstats", "trust.researchers")
data$trust.google.old <- data$trust.google # raw copy for checks
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "0 überhaupt nicht"] <- "0"
  data[[i]][data[[i]] == "10 voll und ganz"] <- "10"
  data[[i]] <- as.numeric(data[[i]])
}


# Trust after (non-)treatment: same recoding for the post-experiment items
vars <- c("trust.google.postexp", "trust.facebook.postexp")
data$trust.google.postexp.old <- data$trust.google.postexp # raw copy for checks
for (i in vars) {
  data[[i]][data[[i]] %in% c("-66", "-77", "-99", "0")] <- NA
  data[[i]][data[[i]] == "0 überhaupt nicht"] <- "0"
  data[[i]][data[[i]] == "10 voll und ganz"] <- "10"
  data[[i]] <- as.numeric(data[[i]])
}

# Check
# table(data$trust.google)
# table(data$trust.google.old)

# Check
# table(data$trust.google.postexp)
# table(data$trust.google.postexp.old)




# Generalized trust, before and after treatment
#
# Both items go through the same steps so that they stay comparable (their
# difference is used as a change score later in this chunk):
#   1. keep a raw copy in "<item>.old" for checks,
#   2. set the raw codes "0", -66, -77, -99 to NA,
#   3. reduce the labelled end points to 0 and 10,
#   4. convert to numeric.
# The post-treatment item previously skipped the "0" code in step 2, so a raw
# "0" would have been read as the scale value 0 instead of missing.
for (i in c("trust.generalized", "trust.generalized.postexp")) {
  data[[paste0(i, ".old")]] <- data[[i]]
  data[[i]][data[[i]] %in% c("0", "-66", "-77", "-99")] <- NA
  data[[i]][data[[i]] == "0 - man kann nicht vorsichtig genug sein"] <- "0"
  data[[i]][data[[i]] == "10 - man kann den meisten Menschen vertrauen"] <- "10"
  data[[i]] <- as.numeric(data[[i]])
}

# Check
# table(data$trust.generalized)
# table(data$trust.generalized.old)

# Check
# table(data$trust.generalized.postexp)
# table(data$trust.generalized.postexp.old)



# Knowing GDPR
data$gdpr.know[data$gdpr.know %in% c("-66", "-77", "-99", "0")] <- NA

# Aims of GDPR ("0" is not recoded here)
data$gdpr.aims[data$gdpr.aims %in% c("-66", "-77", "-99")] <- NA


# Privacy: in wave 3 only privacy.worry is renamed from the raw data (no
# privacy.hurt.* items), so it is recoded directly. The codes -66/-77/-99 are
# set to NA in addition to "0", matching the treatment of privacy.worry in
# waves 1 and 2; previously only "0" was recoded here.
data$privacy.worry[data$privacy.worry %in% c("-66", "-77", "-99", "0")] <- NA


# Feeling about Google, Facebook, FedOffStats, after treatment
# "-77" and "0" are set to NA first; afterwards the German answer labels are
# mapped to the numbers 0 ("Sehr unwohl") to 4 ("Sehr wohl").
vars <- c(
  "feeling.google.share.research.postexp", "feeling.facebook.share.research.postexp", "feeling.fedoffstats.share.research.postexp",
  "feeling.google.share.commercial.postexp", "feeling.facebook.share.commercial.postexp", "feeling.fedoffstats.share.commercial.postexp"
)
data$feeling.google.share.research.postexp.old <- data$feeling.google.share.research.postexp # raw copy for checks
feeling.labels <- c(
  "Sehr unwohl" = "0",
  "Etwas unwohl" = "1",
  "Weder unwohl noch wohl" = "2",
  "Etwas wohl" = "3",
  "Sehr wohl" = "4"
)
for (i in vars) {
  data[[i]][data[[i]] %in% c("-77", "0")] <- NA
  for (label in names(feeling.labels)) {
    data[[i]][data[[i]] == label] <- feeling.labels[[label]]
  }
  data[[i]] <- as.numeric(data[[i]])
}

# Check
# table(data$feeling.google.share.research.postexp.old)
# table(data$feeling.google.share.research.postexp)


# EXPERIMENT RANDOMIZATION
# Label the randomization codes: 1 and 2 both become "GDPR", 3 becomes
# "PassRights", 4 becomes "None". A factor version is stored alongside.
data$randomization_gdprinfo_str <- recode(data$randomization_gdprinfo,
  "1" = "GDPR",
  "2" = "GDPR",
  "3" = "PassRights",
  "4" = "None"
)
data$randomization_gdprinfo_fac <- as.factor(data$randomization_gdprinfo_str)


# Change scores for the outcomes: post-experiment value minus the value
# measured before the experiment
data$trust.google.change <- data$trust.google.postexp - data$trust.google
data$trust.facebook.change <- data$trust.facebook.postexp - data$trust.facebook
data$trust.generalized.change <- data$trust.generalized.postexp - data$trust.generalized


# Keep the cleaned wave under its own name and remove the generic object so
# later code cannot pick it up by accident
data.wave3 <- data
rm(data)
```


# Table A1: Sample stats for single waves

```{r table-A1}
# SAMPLE STATS FROM THE SINGLE DATASETS
# The per-wave counts (n.*1, n.*2, n.*3) are defined earlier in the file.
# Here they are summed across waves and arranged into Table A1.
n.sample.original <- n.sample.original1 + n.sample.original2 + n.sample.original3
n.rejectedquota <- n.rejectedquota1 + n.rejectedquota2 + n.rejectedquota3
n.screenedout <- n.screenedout1 + n.screenedout2 + n.screenedout3
n.suspended <- n.suspended1 + n.suspended2 + n.suspended3
n.duplicated <- n.duplicated1 + n.duplicated2 + n.duplicated3
n.sample.cleaned <- n.sample.cleaned1 + n.sample.cleaned2 + n.sample.cleaned3

# One row per wave plus a row holding the sums across waves
table.sample.stats <- data.frame(
  Wave = c("Wave 1", "Wave 2", "Wave 3", "Sum of observations (across all waves)"),
  "Started.questionnaire" = c(n.sample.original1, n.sample.original2, n.sample.original3, n.sample.original),
  "Rejected" = c(n.rejectedquota1, n.rejectedquota2, n.rejectedquota3, n.rejectedquota),
  "Screened.out" = c(n.screenedout1, n.screenedout2, n.screenedout3, n.screenedout),
  "Suspended" = c(n.suspended1, n.suspended2, n.suspended3, n.suspended),
  "Duplicated" = c(n.duplicated1, n.duplicated2, n.duplicated3, n.duplicated),
  "Sample.size.cleaned" = c(n.sample.cleaned1, n.sample.cleaned2, n.sample.cleaned3, n.sample.cleaned)
)
rownames(table.sample.stats) <- NULL

# Column names were given with dots so that data.frame() accepts them;
# turn the dots back into spaces for display
names(table.sample.stats) <- gsub("\\.", " ", names(table.sample.stats))

# The row with the sums across waves is not shown in the printed table
table.sample.stats <- table.sample.stats[-4, ]

# Render as HTML table (TRUE instead of the reassignable shorthand T)
table <- kable(table.sample.stats,
  row.names = FALSE,
  caption = "Sample statistics", format = "html", booktabs = TRUE
) %>%
  kable_styling(full_width = TRUE, font_size = 12) %>%
  footnote(general = "Respondents who started the survey, were discarded because they either did not fulfill quota criteria (rejected), were screened out due to not living in Germany or being under the age of 18 (screened out), broke off the survey (suspended), or filled out the questionnaire multiple times (duplicated). Duplicates were omitted from the data after screening the data for the other criteria (rejected, screened out, suspended).")
table
save_kable(table, "./tableA1.html", self_contained = TRUE)
```


```{r create-longformat-dataset, message=FALSE, warning=FALSE}

# Label every wave-specific dataset with the wave it belongs to
data.wave1 <- mutate(data.wave1, wave = "Wave 1")
data.wave2 <- mutate(data.wave2, wave = "Wave 2")
data.wave3 <- mutate(data.wave3, wave = "Wave 3")

# Reduce each wave to the variables that are needed later on.
# Negative selections remove helper/backup columns that the contains()
# selections would otherwise pull in.
data.wave1 <- select(
  data.wave1,
  pid, wave, age, contains("education"), sex, state, contains("scale.type"),
  gdpr.know, gdpr.aims, contains("account"),
  contains("trust"), contains("privacy"), contains("device"),
  rts4650338, rts4727492, rts4727550, rts4727617,
  rts4727664, rts4727802, rts4727804, rts4727811, rts4727836, random_id,
  -contains("why"),
  -contains(".group"), -contains(".norm"),
  -contains("old"), -contains("google.prob"), -contains("facebook.prob"),
  -contains("researchers.prob"), -contains("fedoffstats.prob")
)

data.wave2 <- select(
  data.wave2,
  pid, wave, age, contains("education"), sex, state,
  gdpr.know, gdpr.aims, duration, contains("account"),
  contains("device"),
  contains("trust"), contains("privacy"),
  -contains("old")
)

data.wave3 <- select(
  data.wave3,
  pid, wave, age, contains("education"), sex, state,
  gdpr.know, gdpr.aims, contains("account"),
  contains("device"), duration,
  contains("trust"), contains("privacy"),
  contains("feeling"), rts.gdpr, rts.passengers,
  rts.previouspage, contains("randomization_gdprinfo"),
  -contains("old"), text.comprehensible
)

# Stack the three waves on top of each other (long format: one row per
# respondent and wave); columns missing in a wave are filled with NA
data <- bind_rows(data.wave1, data.wave2, data.wave3)
# rm(data.wave1, data.wave2, data.wave3)
# nrow(data)

# Keep only pids that are present in all three waves (for experiment we need the complete Wave 3)
# data <- data %>% filter(pid %in% intersect(data.wave1$pid, data.wave2$pid)) # , data.wave3$pid

# Numeric wave variable: position of the wave label in the ordered label
# vector (1, 2, 3); any other label would yield NA
data$wave.num <- as.numeric(match(data$wave, c("Wave 1", "Wave 2", "Wave 3")))

# store open-ended questions for assessment
# write_excel_csv(data %>% select(wave.num, gdpr.know, gdpr.aims), "gdprknow.csv")
```



```{r more-varables-all-waves, echo=TRUE, message=FALSE, warning=FALSE}

# Wave dummies: 1 if the row belongs to the respective wave, 0 otherwise
# (%in% never returns NA, so a missing wave number is coded 0)
data$wave1 <- as.numeric(data$wave.num %in% 1)
data$wave2 <- as.numeric(data$wave.num %in% 2)
data$wave3 <- as.numeric(data$wave.num %in% 3)

# Numeric version of gdpr.know: "Nein" = 0, "Ja" = 1, everything else NA.
# match() returns the position in c("Nein", "Ja"), hence the "- 1".
data$gdpr.know.num <- as.numeric(match(data$gdpr.know, c("Nein", "Ja"))) - 1

# Numeric version of sex: "männlich" = 0, "weiblich" = 1, everything else NA
data$female <- as.numeric(match(data$sex, c("männlich", "weiblich"))) - 1


# DID: privacy worry. Converted to a numeric variable via a factor whose
# levels are re-ordered: the default (sorted) levels 3, 4, 1, 2 become the
# new levels 1 to 4. The numeric version is shifted to start at 0.
data$privacy.worry.factor <- factor(
  data$privacy.worry,
  levels = levels(factor(data$privacy.worry))[c(3, 4, 1, 2)]
)
data$privacy.worry.numeric <- as.numeric(data$privacy.worry.factor) - 1
# table(data$privacy.worry.factor, data$privacy.worry.numeric)



# DID models: privacy hurt. Numeric versions of the four items.
# Recoding: 0 = No ("Nein"), 1 = Yes ("Ja"), NA otherwise
# NOTE(review): the Wave 3 recoding refers to "privacy.hurt.fedoffstats",
# here the fourth item is "privacy.hurt.authorities" - confirm that this
# column exists; otherwise the resulting variable is NA throughout.
for (hurt.by in c("google", "facebook", "researchers", "authorities")) {
  hurt.item <- paste0("privacy.hurt.", hurt.by)
  hurt.numeric <- paste0(hurt.item, ".numeric")
  data[[hurt.numeric]] <- NA
  data[[hurt.numeric]][data[[hurt.item]] == "Nein"] <- 0
  data[[hurt.numeric]][data[[hurt.item]] == "Ja"] <- 1
}


## Education variables (ordinal and dichotomous versions)
# Check this

# School education as ordinal variable. The labels below are listed in
# ascending order, so the position returned by match() is the code:
# 1 = still at school, ..., 6 = Abitur. Other answers and missing values
# do not match any label and become NA.
data$school.educ.num <- as.numeric(match(
  data$school.education,
  c(
    "Noch Schüler/-in",
    "Schule beendet ohne Abschluss",
    "Volks-/Hauptschulabschluss bzw. Polytechnische Oberschule mit Abschluss 8. oder 9. Klasse",
    "Mittlere Reife, Realschulabschluss bzw. Polytechnische Oberschule mit Abschluss 10. Klasse",
    "Fachhochschulreife (Abschluss einer Fachoberschule etc.)",
    "Abitur bzw. Erweiterte Oberschule mit Abschluss 12. Klasse (Hochschulreife)"
  )
))

# Check
# table(data$school.educ.num, data$school.education)
# 56 instead of 51 NAs? Counts for values 1 to 6 are correct

# Four-category version: codes 3 to 6 become 1 to 4; the two lowest
# categories (still at school, no degree) are set to NA
data$school.educ.num4 <- data$school.educ.num - 2
data$school.educ.num4[which(data$school.educ.num4 <= 0)] <- NA

# Check
# table(data$school.educ.num, data$school.educ.num4)

# Abitur dummy: 1 = Abitur (code 6), 0 = any lower code, NA if the
# school education code is missing
data$abitur <- as.numeric(data$school.educ.num == 6)

# Check
# table(data$school.educ.num, data$abitur)
# 1s ("Noch Sch?ler/-in" are missing)

# Ordinal variable:
#   1 = school education code below 5,
#   2 = Fachhochschulreife or Abitur (codes 5 and 6),
#   3 = codes 5 or 6 plus a tertiary degree (university or "Fachhochschule")
has.entrance.qualification <- data$school.educ.num >= 5
has.tertiary.degree <- data$highest.education %in% c(
  "Bachelor an (Fach-)Hochschule abgeschlossen",
  "Fachhochschulabschluss (z. B. Diplom, Master)",
  "Promotion",
  "Universitätsabschluss (z. B. Diplom, Magister, Staatsexamen, Master)"
)
data$abi.uni <- NA
data$abi.uni[which(!has.entrance.qualification)] <- 1
data$abi.uni[which(has.entrance.qualification)] <- 2
data$abi.uni[which(has.entrance.qualification & has.tertiary.degree)] <- 3

# Check
# table(data$school.educ.num, data$abi.uni)

# Dichotomous variable: 1 if abi.uni is 2 or 3, 0 if abi.uni is 1.
# NOTE(review): the variable is meant to indicate a tertiary degree, but
# abi.uni == 2 (entrance qualification WITHOUT tertiary degree) is also
# coded 1 here - confirm whether only abi.uni == 3 should be coded 1.
data$university.degree <- as.numeric(data$abi.uni >= 2)

# Check
# table(data$university.degree, data$abi.uni)


# Education (4 categories) as factor with labelled levels
data$school.educ.num4.fact <- factor(data$school.educ.num4)
levels(data$school.educ.num4.fact) <- c("Hauptschule", "Mittlere Reife", "Fachhochschulreife", "Abitur")


# Accounts:
# Dummies: 1 = currently has an account ("Yes"), 0 = has no account
# ("No, but before" or "No, never"), NA for any other / missing answer
for (platform in c("google", "facebook", "twitter", "linkedin", "xing")) {
  account.status <- data[[paste0("account.", platform)]]
  account.dummy <- rep(NA, nrow(data))
  account.dummy[account.status %in% "Yes"] <- 1
  account.dummy[account.status %in% c("No, but before", "No, never")] <- 0
  data[[paste0("account.", platform, ".dummy")]] <- account.dummy
}

# Index: number of platforms on which the respondent has an account.
# Computed on whole columns instead of row by row.
# (Note: If one of the dummy variables is NA, number.accounts will also be NA! Possible
# solution: Code NAs to 0)
data$number.accounts <- data$account.google.dummy +
  data$account.facebook.dummy + data$account.twitter.dummy +
  data$account.linkedin.dummy + data$account.xing.dummy
table(data$number.accounts, useNA = "always")


# Number devices:
# Dummies: 1 = owns the device ("ja"), 0 = does not ("nein"),
# NA for any other / missing answer
for (device in c("smartphone", "handy", "desktop", "tablet", "ereader")) {
  device.answer <- data[[paste0("device.", device)]]
  device.dummy <- rep(NA, nrow(data))
  device.dummy[device.answer %in% "ja"] <- 1
  device.dummy[device.answer %in% "nein"] <- 0
  data[[paste0("device.", device, ".dummy")]] <- device.dummy
}


# Index: number of device types the respondent owns.
# Computed on whole columns instead of row by row.
# (Note: If one of the dummy variables is NA, number.devices will also be NA! Possible
# solution: Code NAs to 0)
data$number.devices <- data$device.smartphone.dummy +
  data$device.handy.dummy + data$device.desktop.dummy +
  data$device.tablet.dummy + data$device.ereader.dummy

# Distributions of the single dummies and of the index
table(data$device.smartphone.dummy, useNA = "always")
table(data$device.handy.dummy, useNA = "always")
table(data$device.desktop.dummy, useNA = "always")
table(data$device.tablet.dummy, useNA = "always")
# Fixed: was data$data$device.ereader.dummy, which tabulated NULL
table(data$device.ereader.dummy, useNA = "always")
table(data$number.devices, useNA = "always")


# Time spent on Info page:
# difference between the value recorded for the info page (rts.gdpr or
# rts.passengers) and the value recorded for the previous page.
# A value is only computed if the respective info-page variable is
# neither missing nor 0. If both were set for a respondent, the
# passenger-rights value would take precedence (it is assigned last).
data$time.info <- NA

saw.gdpr.page <- which(!is.na(data$rts.gdpr) & data$rts.gdpr != 0)
data$time.info[saw.gdpr.page] <-
  data$rts.gdpr[saw.gdpr.page] - data$rts.previouspage[saw.gdpr.page]

saw.passengers.page <- which(!is.na(data$rts.passengers) & data$rts.passengers != 0)
data$time.info[saw.passengers.page] <-
  data$rts.passengers[saw.passengers.page] - data$rts.previouspage[saw.passengers.page]

# GDPR: 202 words; Passengers: 212 words; threshold 0.3sec/word
# Calculating threshold:
# GDPR: 202*0.3 = 60.6
# Passengers: 212*0.3 = 63.6
# The cut-offs used below are the whole-second parts (60 and 63).
# speeding: 1 = at or below the cut-off, 0 = above, NA if time.info or
# the treatment group is missing (or the respondent saw no info text)
data$speeding <- NA
data$speeding[data$time.info <= 60 & data$randomization_gdprinfo_str == "GDPR"] <- 1
data$speeding[data$time.info > 60 & data$randomization_gdprinfo_str == "GDPR"] <- 0

data$speeding[data$time.info <= 63 & data$randomization_gdprinfo_str == "PassRights"] <- 1
data$speeding[data$time.info > 63 & data$randomization_gdprinfo_str == "PassRights"] <- 0
# table(data$speeding)

# Further/different control variables for Table 1

# Age in three categories: 1 = below 30, 2 = 30 to 49, 3 = 50 and older
data$age.cat <- case_when(
  data$age < 30 ~ 1,
  data$age >= 30 & data$age <= 49 ~ 2,
  data$age >= 50 ~ 3
)

# Labelled factor version with "below30" as reference category
data$age.cat.fac <- relevel(
  factor(data$age.cat, levels = c(1, 2, 3), labels = c("below30", "30to50", "above50")),
  ref = "below30"
)

# FB and Google accounts as factors with "No, never" as reference category
data$account.facebook.fac <- relevel(factor(data$account.facebook), ref = "No, never")
data$account.google.fac <- relevel(factor(data$account.google), ref = "No, never")


# Add variable indicating "newness"/uniqueness of respondent
#
# enterwave: 1 for every Wave 1 row; 2 (3) for a Wave 2 (Wave 3) row whose
# pid does not occur in any earlier row of the dataset; NA otherwise.
# duplicated() flags a pid that already occurred in an earlier row, which
# replaces the row-by-row comparison against all previous rows.
pid.first.occurrence <- !duplicated(data$pid)

data$enterwave <- NA
data$enterwave[data$wave.num == 1] <- 1
data$enterwave[data$wave.num == 2 & pid.first.occurrence] <- 2
data$enterwave[data$wave.num == 3 & pid.first.occurrence] <- 3

# newness: 1 if enterwave is set for the row, 0 otherwise
data$newness <- as.numeric(!is.na(data$enterwave))
```



```{r add-refined-treatment-variable, include=FALSE}
# The coded open-ended answers (open.gdpr.1 to open.gdpr.4) are stored in
# one file per wave. Each file is reduced to the respondent id and the four
# code columns, tagged with its wave number, de-duplicated and joined to
# the long-format data by respondent id and wave.

# Wave 1
open.gdpr.w1 <- read.csv2("./input_raw/Trust1_open-ends_code_new.csv")
open.gdpr.w1 <- open.gdpr.w1 %>%
  select(p_0002, open.gdpr.1, open.gdpr.2, open.gdpr.3, open.gdpr.4) %>%
  mutate(wave.num = 1) %>%
  rename(pid = "p_0002") %>%
  distinct()
data <- left_join(data, open.gdpr.w1, by = c("pid", "wave.num"))

# Wave 2
open.gdpr.w2 <- read.csv2("./input_raw/Trust2_open-ends_code_new.csv")
open.gdpr.w2 <- open.gdpr.w2 %>%
  select(p_0002, open.gdpr.1, open.gdpr.2, open.gdpr.3, open.gdpr.4) %>%
  mutate(wave.num = 2) %>%
  rename(pid = "p_0002") %>%
  distinct()
data <- left_join(data, open.gdpr.w2, by = c("pid", "wave.num"))

# Wave 3
open.gdpr.w3 <- read.csv2("./input_raw/Trust3_open-ends_code_new20.csv")
open.gdpr.w3 <- open.gdpr.w3 %>%
  select(p_0002, open.gdpr.1, open.gdpr.2, open.gdpr.3, open.gdpr.4) %>%
  mutate(wave.num = 3) %>%
  rename(pid = "p_0002") %>%
  distinct()
data <- left_join(data, open.gdpr.w3, by = c("pid", "wave.num"))

# Move all values to the same columns (open.gdpr.1 - open.gdpr.4).
# The second join renames clashing columns: the Wave 1 codes end up in the
# ".x" columns, the Wave 2 codes in the ".y" columns, and the Wave 3 codes
# in the unsuffixed columns. Non-missing ".x" values are copied first,
# non-missing ".y" values afterwards (whole columns at once instead of
# row by row).
for (code.nr in 1:4) {
  code.col <- paste0("open.gdpr.", code.nr)
  for (join.suffix in c(".x", ".y")) {
    wave.codes <- data[[paste0(code.col, join.suffix)]]
    coded.rows <- which(!is.na(wave.codes))
    data[[code.col]][coded.rows] <- wave.codes[coded.rows]
  }
}

# Removing redundant columns
data <- data %>% select(
  -open.gdpr.1.x, -open.gdpr.1.y,
  -open.gdpr.2.x, -open.gdpr.2.y,
  -open.gdpr.3.x, -open.gdpr.3.y,
  -open.gdpr.4.x, -open.gdpr.4.y
)

data$open.gdpr.1[data$open.gdpr.1 == 22] <- 2 # Removing coding mistake


## Add treatment variable based also on answers on open questions

# Code sets used below (codes refer to the coded open-ended answers)
# NOTE(review): the set that assigns 1 includes code 17, whereas the set
# that blocks the assignment of 0 for open.gdpr.2 - open.gdpr.4 stops at
# 16. A row with a 5:8/18:19 first code and a later code of 17 therefore
# first receives 1 and is then overwritten with 0 - confirm against the
# codebook whether 9:16 or 9:17 is intended.
codes.assign.one <- c(1:4, 9:17, 20)
codes.first.answer.zero <- c(5:8, 18:19)
codes.block.zero <- c(1:4, 9:16, 20)

data$gdpr.know.new <- NA

# 1: at least one of the four codes is in codes.assign.one
data$gdpr.know.new[
  data$open.gdpr.1 %in% codes.assign.one |
    data$open.gdpr.2 %in% codes.assign.one |
    data$open.gdpr.3 %in% codes.assign.one |
    data$open.gdpr.4 %in% codes.assign.one
] <- 1

# 0 (overrides a previously assigned 1):
#   - first code in codes.first.answer.zero and none of the codes 2 to 4
#     in codes.block.zero, or
#   - respondent answered "Nein" to gdpr.know (gdpr.know.num == 0), or
#   - gdpr.aims is missing or "keine Angabe"
data$gdpr.know.new[
  (data$open.gdpr.1 %in% codes.first.answer.zero &
    !(data$open.gdpr.2 %in% codes.block.zero) &
    !(data$open.gdpr.3 %in% codes.block.zero) &
    !(data$open.gdpr.4 %in% codes.block.zero)) |
    data$gdpr.know.num == 0 |
    is.na(data$gdpr.aims) |
    data$gdpr.aims == "keine Angabe"
] <- 0

# data <- filter(data, is.na(gdpr.know.new)==FALSE) # Remove duplicate entries

# table(data$gdpr.know, data$gdpr.know.new)

# Compare distribution of old and new refined treatment variable
table(data$gdpr.know.num, data$gdpr.know.new)

# Keep old num variable to check
data$gdpr.know.num.old <- data$gdpr.know.num
# Replace old treatment variable with new one everywhere
# Important for difference calculations etc.
data$gdpr.know.num <- data$gdpr.know.new
```

```{r create-balanced-panel-data-set}
# Balanced panel: Wave 1 and Wave 2 rows of those respondents whose pid
# occurs in both the Wave 1 and the Wave 2 dataset (Wave 3 rows are dropped)
data.w1w2 <- filter(
  data,
  pid %in% intersect(data.wave1$pid, data.wave2$pid),
  wave.num != 3
)
```


```{r create-data-always-newness, eval=FALSE, include=FALSE}
# Respondents who participated in all 3 waves: count the rows per pid and
# keep the pids that occur exactly three times
data.count.pid <- as.data.frame(table(data$pid)) %>%
  filter(Freq == 3)
data.always <- data %>%
  filter(pid %in% data.count.pid$Var1)

# Only new respondents (rows flagged with newness == 1)
data.newness <- data %>%
  filter(newness == 1)
```


```{r recode-treatment-as-factor}
# Treatment variable as factor, with "None" as the reference category
data$tr.fac <- relevel(factor(data$randomization_gdprinfo_str), ref = "None")
```

```{r recode-comprehension-variable}
# Text comprehension as numeric variable. The answer labels are listed from
# "not at all comprehensible" to "very comprehensible", so the position
# returned by match() is the code (1 to 4); other values become NA.
data$text.comprehensible.num <- as.numeric(match(
  data$text.comprehensible,
  c(
    "Überhaupt nicht verständlich",
    "Wenig verständlich",
    "Etwas verständlich",
    "Sehr verständlich"
  )
))

# Dichotomized version: not at all or little comprehensible (codes 1, 2) = 0,
# somewhat or very comprehensible (codes 3, 4) = 1
data$text.comprehensible.dich <- as.numeric(data$text.comprehensible.num >= 3)
```

```{r word-count2}
# Adding speeding variable for whole questionnaire
#
# data$words holds the number of words of the questionnaire version a
# respondent went through: a base count per wave plus increments for
# questions/pages that only some respondents saw. All increments are
# applied to whole columns; which() drops rows where a condition is NA,
# so those rows are left unchanged.

data$words <- NA

# Wave 1
data$words[data$wave.num == 1] <- 583 # Has to be reduced later on

## Trust questions: word count differs by scale type (normal or probability)
scale.is.prob <- data$scale.type.wave1.prob
normal.scale <- !is.na(scale.is.prob) & scale.is.prob == FALSE
prob.scale <- !is.na(scale.is.prob) & scale.is.prob == TRUE

trust.item.words <- list(
  trust.google = c(normal = 35, prob = 29),
  trust.facebook = c(normal = 35, prob = 29),
  trust.researchers = c(normal = 35, prob = 29),
  trust.fedoffstats = c(normal = 38, prob = 32)
)
for (trust.item in names(trust.item.words)) {
  item.answered <- !is.na(data[[trust.item]])

  rows.hit <- which(item.answered & normal.scale)
  data$words[rows.hit] <- data$words[rows.hit] +
    trust.item.words[[trust.item]][["normal"]]

  rows.hit <- which(item.answered & prob.scale)
  data$words[rows.hit] <- data$words[rows.hit] +
    trust.item.words[[trust.item]][["prob"]]
}


## Follow-up questions: counted if the respective rts variable is
## neither missing nor 0
followup.words <- c(
  rts4727492 = 19, rts4727550 = 19, rts4727617 = 19, rts4727664 = 19,
  rts4727802 = 23, rts4727804 = 23, rts4727811 = 23, rts4727836 = 23
)
for (followup.page in names(followup.words)) {
  rows.hit <- which(!is.na(data[[followup.page]]) & data[[followup.page]] != 0)
  data$words[rows.hit] <- data$words[rows.hit] + followup.words[[followup.page]]
}

## GDPR content question (only asked if gdpr.know is "Ja")
rows.hit <- which(data$wave.num == 1 & data$gdpr.know == "Ja")
data$words[rows.hit] <- data$words[rows.hit] + 12

## Asked about Facebook and/or Google: current or former users of at least
## one of the two (both account variables must be non-missing).
## Fixed: the "No, but before" literal for Google contained a line break
## and therefore never matched, so former Google users were not counted.
current.or.former <- c("Yes", "No, but before")
rows.hit <- which(
  data$wave.num == 1 &
    !is.na(data$account.google) & !is.na(data$account.facebook) &
    (data$account.google %in% current.or.former |
      data$account.facebook %in% current.or.former)
)
data$words[rows.hit] <- data$words[rows.hit] + 23

## Differing by filter group
rows.hit <- which(data$random_id == 1)
data$words[rows.hit] <- data$words[rows.hit] + 38

rows.hit <- which(data$random_id == 2)
data$words[rows.hit] <- data$words[rows.hit] + 32


# Wave 2
# Base word count plus increments for questions that only some respondents
# saw. Increments are applied to whole columns; which() drops rows where a
# condition is NA, so those rows are left unchanged.
# Fixed: the "No, but before" literals for Google contained a line break
# and therefore never matched, so former Google users were not counted.
data$words[data$wave.num == 2] <- 599

## GDPR content question (only asked if gdpr.know is "Ja")
rows.hit <- which(data$wave.num == 2 & data$gdpr.know == "Ja")
data$words[rows.hit] <- data$words[rows.hit] + 12

## Asked about Facebook and/or Google
current.or.former <- c("Yes", "No, but before")
google.user <- data$account.google %in% current.or.former
facebook.user <- data$account.facebook %in% current.or.former

# Current or former Google users
rows.hit <- which(data$wave.num == 2 & google.user)
data$words[rows.hit] <- data$words[rows.hit] + 47

# Current or former Facebook users
rows.hit <- which(data$wave.num == 2 & facebook.user)
data$words[rows.hit] <- data$words[rows.hit] + 47

# Current or former users of at least one of the two
# (both account variables must be non-missing)
rows.hit <- which(
  data$wave.num == 2 &
    !is.na(data$account.facebook) & !is.na(data$account.google) &
    (facebook.user | google.user)
)
data$words[rows.hit] <- data$words[rows.hit] + 23

# Wave 3
# Base word count plus increments for questions that only some respondents
# saw. Increments are applied to whole columns; which() drops rows where a
# condition is NA, so those rows are left unchanged.

## Treatment
# NOTE(review): the base count is 771, but the counts for respondents who
# answered "Nein" and were assigned an info text start from 783 - confirm
# that the difference of 12 words is intended.
data$words[data$wave.num == 3] <- 771 # 15 pages
data$words[data$wave.num == 3 &
  data$randomization_gdprinfo_fac == "GDPR" &
  data$gdpr.know == "Nein"] <- 783 + 227
data$words[data$wave.num == 3 &
  data$randomization_gdprinfo_fac == "PassRights" &
  data$gdpr.know == "Nein"] <- 783 + 239


## GDPR content question (only asked if gdpr.know is "Ja")
rows.hit <- which(data$gdpr.know == "Ja" & data$wave.num == 3)
data$words[rows.hit] <- data$words[rows.hit] + 12

## Feeling towards sharing data (randomized): an item is counted if the
## respondent has a non-missing value on it
feeling.item.words <- c(
  feeling.google.share.research.postexp = 31,
  feeling.facebook.share.research.postexp = 31,
  feeling.fedoffstats.share.research.postexp = 33,
  feeling.google.share.commercial.postexp = 34,
  feeling.facebook.share.commercial.postexp = 34,
  feeling.fedoffstats.share.commercial.postexp = 36
)
for (feeling.item in names(feeling.item.words)) {
  rows.hit <- which(!is.na(data[[feeling.item]]))
  data$words[rows.hit] <- data$words[rows.hit] + feeling.item.words[[feeling.item]]
}



# "Allowed" duration: Code whether respondent is speeder or not, given  
# a minimum of 0.3 secs per word: (total word count)*0.3 smaller(/equal)  
# or bigger than seconds spent at survey
# Note: for wave 1, not the total duration is chosen, but the time up  
# to the entering of the WikiSurvey section of the survey, as the  
# inclution of the latter could distort results
# Minimum plausible duration in seconds: 0.3 seconds per word plus
# additional seconds that depend on the wave and on the pages shown.
# The increments are applied in a fixed order to whole columns; which()
# drops rows where a condition is NA, so those rows are left unchanged.
# Fixed: the "No, but before" literals for Google contained a line break
# and therefore never matched, so former Google users were not counted.
data$max.duration <- data$words * 0.3

## Wave-specific constant
rows.hit <- which(data$wave.num == 1 | data$wave.num == 2)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 14

rows.hit <- which(data$wave.num == 3)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 20

## Additional second for additional page with GDPR content question
# (all waves)
rows.hit <- which(data$gdpr.know == "Ja")
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 1: Additional seconds for additional pages with follow-up questions
## (page counted if the respective rts variable is neither missing nor 0)
followup.pages <- c(
  "rts4727492", "rts4727550", "rts4727617", "rts4727664",
  "rts4727802", "rts4727804", "rts4727811", "rts4727836"
)
for (followup.page in followup.pages) {
  rows.hit <- which(!is.na(data[[followup.page]]) & data[[followup.page]] != 0)
  data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1
}

## Current or former users of Google / Facebook
current.or.former <- c("Yes", "No, but before")
google.user <- data$account.google %in% current.or.former
facebook.user <- data$account.facebook %in% current.or.former
# At least one of the two, with both account variables non-missing
either.user <- !is.na(data$account.facebook) & !is.na(data$account.google) &
  (google.user | facebook.user)

## Wave 1: Additional second for (former) Google/Facebook users
rows.hit <- which(data$wave.num == 1 & either.user)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 2: Additional second for (former) Google or Facebook users
rows.hit <- which(data$wave.num == 2 & either.user)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 2: Additional second for (former) Google users
rows.hit <- which(data$wave.num == 2 & google.user)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 2: Additional second for (former) Facebook users
rows.hit <- which(data$wave.num == 2 & facebook.user)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 2: Additional second for (former) Google or Facebook users
# NOTE(review): this is the second time this increment is applied to
# Wave 2 (see above) - confirm that it is not an accidental duplicate.
rows.hit <- which(data$wave.num == 2 & either.user)
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 1

## Wave 3: Additional seconds for treated individuals
rows.hit <- which(data$randomization_gdprinfo_str %in% c("GDPR", "PassRights"))
data$max.duration[rows.hit] <- data$max.duration[rows.hit] + 2

# Speeding variable
# 1 = time spent is at most max.duration (speeder), 0 = time spent is
# longer than max.duration, NA if either value is missing.
# Time spent is `duration` for Waves 2 and 3 and `rts4650338` for Wave 1.
# Computed on whole columns; which() drops rows with NA conditions.
data$speeding.survey <- NA

# Waves 2 and 3
rows.later.waves <- which(
  data$wave.num != 1 &
    !is.na(data$duration) & !is.na(data$max.duration)
)
data$speeding.survey[rows.later.waves] <- as.numeric(
  data$max.duration[rows.later.waves] >= data$duration[rows.later.waves]
)

# Wave 1
rows.wave1 <- which(
  data$wave.num == 1 &
    !is.na(data$rts4650338) & !is.na(data$max.duration)
)
data$speeding.survey[rows.wave1] <- as.numeric(
  data$max.duration[rows.wave1] >= data$rts4650338[rows.wave1]
)

```

```{r add-changes-to-balanced-data-and-create-figure, echo=TRUE}
# Adds, for every respondent in the balanced panel, the change of outcomes,
# treatment status and controls from Wave 1 to Wave 2 (variables ending in
# ".diff"). Rows are sorted by respondent and wave and grouped by
# respondent, so the lag refers to the respondent's previous wave; the
# first row of each respondent gets NA. The result stays grouped by pid.

# Difference between a value and the value in the previous row
change.from.previous <- function(x) {
  x - dplyr::lag(x, n = 1, default = NA)
}

data.w1w2 <- data.w1w2 %>%
  arrange(pid, wave) %>%
  group_by(pid) %>%
  mutate(
    # outcomes
    trust.google.diff = change.from.previous(trust.google),
    trust.facebook.diff = change.from.previous(trust.facebook),
    trust.researchers.diff = change.from.previous(trust.researchers),
    trust.fedoffstats.diff = change.from.previous(trust.fedoffstats),

    # treatment status: previous value and change
    gdpr.know.num.lag = dplyr::lag(gdpr.know.num, n = 1, default = NA),
    gdpr.know.num.diff = gdpr.know.num - gdpr.know.num.lag,

    # controls
    device.smartphone.dummy.diff = change.from.previous(device.smartphone.dummy),
    device.handy.dummy.diff = change.from.previous(device.handy.dummy),
    device.desktop.dummy.diff = change.from.previous(device.desktop.dummy),
    device.tablet.dummy.diff = change.from.previous(device.tablet.dummy),
    device.ereader.dummy.diff = change.from.previous(device.ereader.dummy),
    female.diff = change.from.previous(female),
    age.diff = change.from.previous(age),
    school.educ.num4.diff = change.from.previous(school.educ.num4),
    privacy.worry.numeric.diff = change.from.previous(privacy.worry.numeric),
    trust.generalized.diff = change.from.previous(trust.generalized)
  )
# Do the same for account.facebook.fac and account.google.fac

```

```{r identify-respondents-with-wrong-treatmentstatus, echo=TRUE}
# Sort respondents into four sets according to their gdpr.know.num answers in
# Wave 1 and Wave 2, using the lag/diff columns created above. Each set is a
# table of distinct pids.
gdpr.know.changes <- select(
  data.w1w2,
  pid, gdpr.know.num, gdpr.know.num.lag, gdpr.know.num.diff
)

# 1 in Wave 1, 0 in Wave 2 (change of -1); authors' note: 37 respondents
respondents.1.to.0 <- gdpr.know.changes %>%
  filter(gdpr.know.num.diff == -1) %>%
  distinct(pid)

# 1 in both waves (no change, current value 1); authors' note: 493 respondents
respondents.1.to.1 <- gdpr.know.changes %>%
  filter(gdpr.know.num.diff == 0, gdpr.know.num == 1) %>%
  distinct(pid)

# 0 in Wave 1, 1 in Wave 2 (change of +1)
respondents.0.to.1 <- gdpr.know.changes %>%
  filter(gdpr.know.num.diff == 1) %>%
  distinct(pid)

# 0 in both waves (no change, current value 0)
respondents.0.to.0 <- gdpr.know.changes %>%
  filter(gdpr.know.num.diff == 0, gdpr.know.num == 0) %>%
  distinct(pid)

# Check single respondents whether only two values remain
# data.w1w2 %>% filter(pid == 401017741) %>% select(pid, wave.num)

# Dummy for respondents that should be excluded later: 1 for everyone who
# answered 1 in Wave 1 (sets 1.to.0 and 1.to.1), 0 otherwise
data.w1w2$respondents.wrong.status <- as.numeric(
  data.w1w2$pid %in% respondents.1.to.0$pid |
    data.w1w2$pid %in% respondents.1.to.1$pid
)
```
  
```{r exclude-respondents-with-wrong-treatmentstatus, echo=TRUE}
# Keep only respondents whose wrong-treatment-status dummy is 0
data.w1w2 <- filter(data.w1w2, respondents.wrong.status == 0)
```

In our analysis of the panel data we excluded respondents who indicated having heard of the GDPR in Wave 1 (= 1) and not having heard of the GDPR in Wave 2 (= 0) (`r nrow(respondents.1.to.0)` respondents). We also excluded respondents who indicated having heard of the GDPR in both Wave 1 and Wave 2 (`r nrow(respondents.1.to.1)` respondents). Accordingly, our “treatment” group consists of respondents who indicated not having heard of the GDPR in Wave 1 (= 0) and having heard of it in Wave 2 (= 1) (`r nrow(respondents.0.to.1)` respondents), and our comparison/control group consists of respondents who indicated not having heard of the GDPR in either Wave 1 (= 0) or Wave 2 (= 0) (`r nrow(respondents.0.to.0)` respondents).

# Treatment status over time
We can also look at the distribution of the treatment status (0, 1) across the three waves and the corresponding means of the outcome variables. The resulting table is saved as `Treatment status over time.html` in the working directory.

```{r treatmentstatus-acrosstime}
# This is not finished...
# Mean trust ratings and cell sizes by GDPR awareness (gdpr.know), wave and
# awareness pattern (gdpr.know.wave), restricted to respondents whose pid is
# present in all three wave data sets. Rows with missing gdpr.know are dropped
# before the pattern is built. The table is written to an HTML file.
#
# Changes compared with the previous version:
# * summarise_all(funs(...)) and mutate_if() are replaced by across();
#   funs() is deprecated since dplyr 0.8.0. Column names and column order
#   (all *_mean columns first, then all *_n columns) are kept.
# * rows are sorted by pid and wave before the per-respondent pattern is
#   pasted together, so the pattern no longer depends on incidental row order.
# * T is spelled out as TRUE.
trust.vars <- c(
  "trust.google",
  "trust.facebook",
  "trust.fedoffstats",
  "trust.researchers"
)

by.group <- data %>%
  filter(pid %in% intersect(intersect(data.wave1$pid, data.wave2$pid), data.wave3$pid)) %>%
  filter(!is.na(gdpr.know)) %>%
  arrange(pid, wave) %>%
  group_by(pid) %>%
  # one string per respondent: the gdpr.know answers of all remaining rows
  mutate(gdpr.know.wave = paste0(gdpr.know, collapse = "")) %>%
  ungroup() %>%
  group_by(gdpr.know, wave, gdpr.know.wave) %>%
  summarise(
    across(all_of(trust.vars), ~ mean(.x, na.rm = TRUE), .names = "{.col}_mean"),
    across(all_of(trust.vars), ~ n(), .names = "{.col}_n"),
    .groups = "drop"
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  arrange(wave, gdpr.know.wave)

# NOTE(review): the object name `table` is kept in case later code refers to
# it; calls such as table(x) still resolve to base::table().
table <- kable(by.group,
  row.names = FALSE,
  caption = "Treatment status over time", format = "html", booktabs = TRUE
) %>%
  kable_styling(full_width = TRUE, font_size = 12)

save_kable(table, "./Treatment status over time.html", self_contained = TRUE)
```
<br><br><br>

# Inter-coder reliability

```{r}
# Compare the two codings of the open-ended GDPR answers.
# rater2 = gdpr.aims.dummy from the separately coded file,
# rater1 = gdpr.know.new from the main data.
# Rows are matched on pid_wave (pid and wave pasted together).
data_open_coded <- read_delim("./input_raw/data_subset_open_200_coded.csv", delim = ";")
data_open_coded$pid_wave <- paste0(data_open_coded$pid, data_open_coded$wave)
data_open_coded <- rename(data_open_coded, rater2 = gdpr.aims.dummy)

# Same key on the main data (the column stays in `data`)
data$pid_wave <- paste0(data$pid, data$wave)

# Keys that occur in both data sets
intersects <- intersect(data$pid_wave, data_open_coded$pid_wave)

data_check <- data %>%
  filter(pid_wave %in% intersects) %>%
  select(pid_wave, gdpr.know, gdpr.aims, gdpr.know.num, gdpr.know.new) %>%
  rename(rater1 = gdpr.know.new) %>%
  left_join(data_open_coded, by = "pid_wave")

# Agreement measures: correlation and Cohen's kappa (two implementations)
cor(data_check$rater1, data_check$rater2)

psych::cohen.kappa(x = cbind(data_check$rater1, data_check$rater2))
irr::kappa2(cbind(data_check$rater1, data_check$rater2))

# Row-wise agreement flag and its frequency table
data_check$same_code <- data_check$rater1 == data_check$rater2

table(data_check$same_code)


```

<br><br><br>




# Clean and anonymize data (reproduction data)
* This part was not used in the original data analysis but is used to create the reproduction data files that are stored in `"./input/data.RData"`.

```{r }
# Remove every column whose name contains "rts4" from all five data sets
data <- select(data, !contains("rts4"))
data.w1w2 <- select(data.w1w2, !contains("rts4"))
data.wave1 <- select(data.wave1, !contains("rts4"))
data.wave2 <- select(data.wave2, !contains("rts4"))
data.wave3 <- select(data.wave3, !contains("rts4"))
```


* Delete 10 open-ended responses that may threaten anonymity (only for reproduction data not data used in publication)
* Delete respondents younger than 20 and older than 70 for anonymity reasons
```{r echo=TRUE}
# pids whose open-ended answers are problematic / potentially non-anonymous
# (identified manually)
IDs <- c(
  487441448, 493654349, 451261248, 483763640, 427849441,
  473257843, 401683441, 441124447, 413907744, 471269143
)


# Remove these pids from every data set; the row count is printed before and
# after each removal as a check
nrow(data)
data <- filter(data, !(pid %in% IDs))
nrow(data)

nrow(data.w1w2)
data.w1w2 <- filter(data.w1w2, !(pid %in% IDs))
nrow(data.w1w2)

nrow(data.wave1)
data.wave1 <- filter(data.wave1, !(pid %in% IDs))
nrow(data.wave1)

nrow(data.wave2)
data.wave2 <- filter(data.wave2, !(pid %in% IDs))
nrow(data.wave2)

nrow(data.wave3)
data.wave3 <- filter(data.wave3, !(pid %in% IDs))
nrow(data.wave3)


# Exclude unnecessary variables
# Columns kept in both reproduction data sets, in output order
variables_shared <- c(
  "pid", "wave", "wave.num",
  "trust.google", "trust.facebook", "trust.researchers", "trust.fedoffstats",
  "gdpr.know", "gdpr.aims", "gdpr.know.num", "gdpr.know.num.old", "gdpr.know.new",
  "female", "age.cat.fac", "school.educ.num4.fact", "privacy.worry.numeric",
  "account.facebook.fac", "account.google.fac",
  "device.smartphone.dummy", "device.handy.dummy", "device.desktop.dummy",
  "device.tablet.dummy", "device.ereader.dummy",
  "trust.generalized", "number.devices", "number.accounts", "abitur"
)

# Columns for `data`
variables_keep1 <- c(
  variables_shared,
  "trust.google.change", "trust.facebook.change", "trust.generalized.change",
  "tr.fac", "text.comprehensible.dich",
  "speeding", "randomization_gdprinfo_str", "age"
)

# Columns for `data.w1w2` (additionally carries the *.diff change scores)
variables_keep2 <- c(
  variables_shared,
  "trust.google.diff", "trust.facebook.diff", "trust.fedoffstats.diff",
  "trust.researchers.diff", "trust.generalized.diff", "gdpr.know.num.diff",
  "trust.google.change", "trust.facebook.change", "trust.generalized.change",
  "speeding", "randomization_gdprinfo_str", "age",
  "privacy.worry.numeric.diff"
)

# Backup / restore helpers for interactive re-runs (kept disabled)
# data_2 <- data
# data.w1w2_2 <- data.w1w2
#
# data <- data_2
# data.w1w2 <- data.w1w2_2

# Delete cells N < 5
# Keep the selected columns and respondents with 19 < age < 100; row counts
# are printed before and after as a check.
# NOTE(review): the accompanying text mentions "older than 70", while the
# filter uses age < 100 -- confirm which bound is intended.
nrow(data)
data <- data %>%
  select(all_of(variables_keep1)) %>%
  filter(age < 100, age > 19)
nrow(data)

nrow(data.w1w2)
data.w1w2 <- data.w1w2 %>%
  select(all_of(variables_keep2)) %>%
  filter(age < 100, age > 19)
nrow(data.w1w2)



```



# Save reproduction data

```{r save-data}
# Write the two reproduction data sets to one .RData file
# (data.wave1, data.wave2 and data.wave3 are deliberately not saved)
save(
  list = c("data", "data.w1w2"),
  file = "./data.RData"
)
```







# Open-ended responses
* Generate data-frame with subset for second coder

```{r load-data-2}
# Open-ended answers with their identifiers
data_subset_open_6258 <- dplyr::select(data, pid, wave, gdpr.know, gdpr.aims)

# Rows with gdpr.know == "Ja"; two random subsets (200 and 100 rows) are
# drawn from them for the second coder
answered.ja <- filter(data_subset_open_6258, gdpr.know == "Ja")
data_subset_open_200 <- sample_n(answered.ja, 200)
data_subset_open_100 <- sample_n(answered.ja, 100)

# Export is disabled; the draws above are random, so re-running would produce
# different subsets
#write_excel_csv(data_subset_open_200, "./input/data_subset_open_200.csv")
#write_excel_csv(data_subset_open_100, "./input/data_subset_open_100.csv")
```

* Get statistics on the wordcount of those open-ended responses.

```{r word-count}

# Number of words in each open-ended answer (gdpr.aims)
data$word_count <- stringi::stri_count_words(data$gdpr.aims)

# How many open-ended responses?
# Among rows with gdpr.know == "Ja": missing vs. non-missing answers, and
# empty-string answers
x <- filter(data, gdpr.know == "Ja")
table(is.na(x$gdpr.aims))
table(x$gdpr.aims=="")

# Same missingness count via base subsetting (rows with missing gdpr.know
# yield NA here)
table(is.na(data$gdpr.aims[data$gdpr.know == "Ja"]))

# Summary stats
summary(data$word_count)

# Check short answers
#data %>% filter(word_count <= 3) %>% dplyr::select(gdpr.aims)

# Check long answers
#data %>% filter(word_count >= 20) %>% dplyr::select(gdpr.aims)

```




